Pros
Cons
Pros
Cons
https://www.tidyverse.org/ https://rviews.rstudio.com/2017/06/08/what-is-the-tidyverse/
With base R the function calls are nested, which means that we need to read the operations from the inside out rather than in the order they are performed; chaining the same steps with %>% “pipes” shows that order nicely.
x <- c(0.109, 0.359, 0.63, 0.996, 0.515, 0.142, 0.017, 0.829, 0.907)
# base: nested calls, evaluated from the innermost (log) outwards
round(exp(diff(log(x))), digits = 1)
## [1] 3.3 1.8 1.6 0.5 0.3 0.1 48.8 1.1
# magrittr: the same four steps, one per line, in the order they are applied
library(magrittr)
x %>%
  log() %>%
  diff() %>%
  exp() %>%
  round(digits = 1)
## [1] 3.3 1.8 1.6 0.5 0.3 0.1 48.8 1.1
Usually a tibble displays data more nicely than a base R data frame (this is less of a problem in notebooks); the difference is most noticeable when looking at long character strings.
Modified from: http://www.onthelambda.com/2014/02/10/how-dplyr-replaced-my-most-common-r-idioms/
diamonds: A dataset containing the prices and other attributes of almost 54,000 diamonds. http://ggplot2.tidyverse.org/reference/diamonds.html
library(ggplot2)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.2
## ── Attaching packages ──────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 1.3.4 ✔ purrr 0.2.4
## ✔ tidyr 0.7.2 ✔ dplyr 0.7.4
## ✔ readr 1.1.1 ✔ stringr 1.2.0
## ✔ tibble 1.3.4 ✔ forcats 0.2.0
## Warning: package 'tidyr' was built under R version 3.4.2
## Warning: package 'purrr' was built under R version 3.4.2
## Warning: package 'dplyr' was built under R version 3.4.2
## ── Conflicts ─────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
# Show the column types and the first few values of each column
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# base: colour from the `color` factor, plotting symbol from the integer
# codes of the ordered `cut` factor
plot(
  x = diamonds$carat,
  y = diamonds$price,
  col = diamonds$color,
  pch = as.numeric(diamonds$cut)
)
# ggplot2: the same variables, declared as aesthetic mappings
ggplot(
  data = diamonds,
  mapping = aes(x = carat, y = price, colour = color, shape = cut)
) +
  geom_point()
Again, both approaches work well, with dplyr working faster and needing fewer characters. Base R helps us with column names if we use tab completion, while with dplyr we need to know the exact column names.
# Step 1: keep rows with a positive depth (z) and a cut other than "Fair"
# base R
diamonds_base <- diamonds[diamonds$z > 0 & diamonds$cut != "Fair", ]
# dplyr
diamonds_tdyv <- filter(diamonds, z > 0, cut != "Fair")

# Step 2: sort by price, most expensive first
# base R
diamonds_base <- diamonds_base[order(diamonds_base$price, decreasing = TRUE), ]
# dplyr
diamonds_tdyv <- arrange(diamonds_tdyv, desc(price))

# Step 3: keep every column whose name starts with "c", plus price, x, y, z
# base R
# (the helper is not called `names`, so it does not shadow base::names())
keep_cols <- c(
  grep("^c", colnames(diamonds_base), value = TRUE),
  "price", "x", "y", "z"
)
diamonds_base <- diamonds_base[, keep_cols]
# dplyr
diamonds_tdyv <- select(diamonds_tdyv, starts_with("c"), price, x, y, z)

# Step 4: add the derived columns `mass` and `size`
# base R
diamonds_base$mass <- diamonds_base$carat * 0.2
diamonds_base$size <- diamonds_base$x * diamonds_base$y * diamonds_base$z
# dplyr
diamonds_tdyv <- mutate(diamonds_tdyv, mass = carat * 0.2, size = x * y * z)

# Step 5: per-color count and mean price
# base R
summary1 <- aggregate(price ~ color, data = diamonds_base, FUN = mean)
summary2 <- aggregate(price ~ color, data = diamonds_base, FUN = length)
# Both aggregates name their value column `price`; rename the count so that
# merge() does not produce ambiguous `price.x` / `price.y` columns and the
# result has the same columns as the dplyr summary (color, num_color, price).
names(summary2)[names(summary2) == "price"] <- "num_color"
summary_diamonds_base <- merge(summary2, summary1, by = "color")
# dplyr
by.color <- group_by(diamonds_tdyv, color)
summary_diamonds_tdyv <- summarise(
  by.color,
  num_color = n(),
  price = mean(price)
)
The tidyverse really shows its selling point here, as it is much shorter and much more readable.
# base R
# Build the column list from the raw `diamonds` data that this chunk starts
# from, so the chunk does not depend on a `diamonds_base` left over from
# earlier code (and avoid shadowing base::names()).
keep_cols <- c(
  grep("^c", colnames(diamonds), value = TRUE),
  "price", "x", "y", "z"
)
# Filter rows and select columns in a single subsetting step
diamonds_base <- diamonds[diamonds$z > 0 & diamonds$cut != "Fair", keep_cols]
# Sort by carat, largest first (row order does not change the summary below)
diamonds_base <- diamonds_base[order(diamonds_base$carat, decreasing = TRUE), ]
diamonds_base$mass <- diamonds_base$carat * 0.2
diamonds_base$size <- diamonds_base$x * diamonds_base$y * diamonds_base$z
summary1 <- aggregate(price ~ color, data = diamonds_base, FUN = mean)
summary2 <- aggregate(price ~ color, data = diamonds_base, FUN = length)
# Rename the count column before merging; otherwise both inputs carry a
# `price` column and merge() returns ambiguous `price.x` / `price.y`.
# The result then has the columns color, num_color, price, like the
# tidyverse pipeline below.
names(summary2)[names(summary2) == "price"] <- "num_color"
summary_diamonds_base <- merge(summary2, summary1, by = "color")
# tidyverse
# The same steps as one pipeline; the result is printed, not stored.
diamonds %>%
  filter(z > 0, cut != "Fair") %>%
  arrange(desc(carat)) %>%
  select(starts_with("c"), price, x, y, z) %>%
  mutate(mass = carat * 0.2, size = x * y * z) %>%
  group_by(color) %>%
  summarise(num_color = n(), price = mean(price))
# base
# Factors passed as `col` are drawn with the palette entry of their level.
plot(diamonds_base$carat, diamonds_base$price, col = diamonds_base$color)
plot(diamonds_base$carat, diamonds_base$price, col = diamonds_base$clarity)
# A numeric `col` is used as an integer index into palette() (0 is the
# background colour), so passing the continuous `carat` directly gives no
# gradient and hides stones below 1 carat. Bin carat into 100 equal-width
# intervals and look each bin up in a dark-to-light blue ramp instead.
carat_ramp <- colorRampPalette(c("#132B43", "#56B1F7"))(100)
carat_bin <- cut(diamonds_base$carat, breaks = 100, labels = FALSE)
plot(diamonds_base$mass, diamonds_base$size, col = carat_ramp[carat_bin])
# ggplot2
# These two plots use the full `diamonds` data and also map `cut` to shape.
ggplot(diamonds, aes(carat, price, col = clarity, shape = cut)) +
  geom_point()
ggplot(diamonds, aes(carat, price, col = color, shape = cut)) +
  geom_point()
# The wrangling pipeline can feed ggplot() directly. No group_by() is needed
# here: the plot maps no aesthetic to the groups.
diamonds %>%
  filter(z > 0, cut != "Fair") %>%
  arrange(desc(carat)) %>%
  select(starts_with("c"), price, x, y, z) %>%
  mutate(mass = carat * 0.2, size = x * y * z) %>%
  ggplot(aes(mass, size, col = carat)) +
  geom_point()
Extra resources: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf http://www.significantdigits.org/2017/10/switching-from-base-r-to-tidyverse/ Table of base to tidyverse function conversion https://rviews.rstudio.com/2017/06/08/what-is-the-tidyverse/ [R for Data Science](http://r4ds.had.co.nz/)
The source for this tutorial is on github.